JASON STOPAS
MODULE 04: LAB 01
import gdown
import polars as pl
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import plotly.graph_objects as go
# Download the Lightcast job-postings extract (~717 MB) from Google Drive
# and load it with polars.  Skip the download when the file is already
# present so re-running the notebook does not re-pull 700+ MB.
import os

drive_loc = 'https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ'
download_LC = 'downloaded_file.csv'
if not os.path.exists(download_LC):
    gdown.download(drive_loc, download_LC, quiet=False)
Jobs = pl.read_csv(download_LC)
Downloading... From (original): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ From (redirected): https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ&confirm=t&uuid=1d8c7256-c988-4711-b880-980200dc25ab To: C:\Users\jtsto\OneDrive\Documents\03 BU SPRING 2025\AD 688\Mod04_Lab01\downloaded_file.csv 100%|██████████| 717M/717M [01:04<00:00, 11.1MB/s]
Jobs.head()  # Preview the first rows to inspect column names and dtypes
| ID | LAST_UPDATED_DATE | LAST_UPDATED_TIMESTAMP | DUPLICATES | POSTED | EXPIRED | DURATION | SOURCE_TYPES | SOURCES | URL | ACTIVE_URLS | ACTIVE_SOURCES_INFO | TITLE_RAW | BODY | MODELED_EXPIRED | MODELED_DURATION | COMPANY | COMPANY_NAME | COMPANY_RAW | COMPANY_IS_STAFFING | EDUCATION_LEVELS | EDUCATION_LEVELS_NAME | MIN_EDULEVELS | MIN_EDULEVELS_NAME | MAX_EDULEVELS | MAX_EDULEVELS_NAME | EMPLOYMENT_TYPE | EMPLOYMENT_TYPE_NAME | MIN_YEARS_EXPERIENCE | MAX_YEARS_EXPERIENCE | IS_INTERNSHIP | SALARY | REMOTE_TYPE | REMOTE_TYPE_NAME | ORIGINAL_PAY_PERIOD | SALARY_TO | SALARY_FROM | … | SOC_2021_5_NAME | LOT_CAREER_AREA | LOT_CAREER_AREA_NAME | LOT_OCCUPATION | LOT_OCCUPATION_NAME | LOT_SPECIALIZED_OCCUPATION | LOT_SPECIALIZED_OCCUPATION_NAME | LOT_OCCUPATION_GROUP | LOT_OCCUPATION_GROUP_NAME | LOT_V6_SPECIALIZED_OCCUPATION | LOT_V6_SPECIALIZED_OCCUPATION_NAME | LOT_V6_OCCUPATION | LOT_V6_OCCUPATION_NAME | LOT_V6_OCCUPATION_GROUP | LOT_V6_OCCUPATION_GROUP_NAME | LOT_V6_CAREER_AREA | LOT_V6_CAREER_AREA_NAME | SOC_2 | SOC_2_NAME | SOC_3 | SOC_3_NAME | SOC_4 | SOC_4_NAME | SOC_5 | SOC_5_NAME | LIGHTCAST_SECTORS | LIGHTCAST_SECTORS_NAME | NAICS_2022_2 | NAICS_2022_2_NAME | NAICS_2022_3 | NAICS_2022_3_NAME | NAICS_2022_4 | NAICS_2022_4_NAME | NAICS_2022_5 | NAICS_2022_5_NAME | NAICS_2022_6 | NAICS_2022_6_NAME |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | str | str | i64 | str | str | i64 | str | str | str | str | str | str | str | str | i64 | i64 | str | str | bool | str | str | i64 | str | i64 | str | i64 | str | i64 | i64 | bool | i64 | i64 | str | str | i64 | i64 | … | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str | str | str | str | str | str | str | str | str | str | str | i64 | str | i64 | str | i64 | str | i64 | str | i64 | str |
| "1f57d95acf4dc67ed2819eb12f049f… | "9/6/2024" | "2024-09-06 20:32:57.352 Z" | 0 | "6/2/2024" | "6/8/2024" | 6 | "[ "Company" ]" | "[ "brassring.com" ]" | "[ "https://sjobs.brassring.c… | "[]" | null | "Enterprise Analyst (II-III)" | "31-May-2024 Enterprise Analys… | "6/8/2024" | 6 | 894731 | "Murphy USA" | "Murphy USA" | false | "[ 2 ]" | "[ "Bachelor's degree" ]" | 2 | "Bachelor's degree" | null | null | 1 | "Full-time (> 32 hours)" | 2 | 2 | false | null | 0 | "[None]" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231010 | "Business Intelligence Analyst" | 23101011 | "General ERP Analyst / Consulta… | 2310 | "Business Intelligence" | 23101011 | "General ERP Analyst / Consulta… | 231010 | "Business Intelligence Analyst" | 2310 | "Business Intelligence" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | "[ 7 ]" | "[ "Artificial Intelligence" … | 44 | "Retail Trade" | 441 | "Motor Vehicle and Parts Dealer… | 4413 | "Automotive Parts, Accessories,… | 44133 | "Automotive Parts and Accessori… | 441330 | "Automotive Parts and Accessori… |
| "0cb072af26757b6c4ea9464472a50a… | "8/2/2024" | "2024-08-02 17:08:58.838 Z" | 0 | "6/2/2024" | "8/1/2024" | null | "[ "Job Board" ]" | "[ "maine.gov" ]" | "[ "https://joblink.maine.gov… | "[]" | null | "Oracle Consultant - Reports (3… | "Oracle Consultant - Reports (3… | "8/1/2024" | null | 133098 | "Smx Corporation Limited" | "SMX" | true | "[ 99 ]" | "[ "No Education Listed" ]" | 99 | "No Education Listed" | null | null | 1 | "Full-time (> 32 hours)" | 3 | 3 | false | null | 1 | "Remote" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231010 | "Business Intelligence Analyst" | 23101012 | "Oracle Consultant / Analyst" | 2310 | "Business Intelligence" | 23101012 | "Oracle Consultant / Analyst" | 231010 | "Business Intelligence Analyst" | 2310 | "Business Intelligence" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | null | null | 56 | "Administrative and Support and… | 561 | "Administrative and Support Ser… | 5613 | "Employment Services" | 56132 | "Temporary Help Services" | 561320 | "Temporary Help Services" |
| "85318b12b3331fa490d32ad014379d… | "9/6/2024" | "2024-09-06 20:32:57.352 Z" | 1 | "6/2/2024" | "7/7/2024" | 35 | "[ "Job Board" ]" | "[ "dejobs.org" ]" | "[ "https://dejobs.org/dallas… | "[]" | null | "Data Analyst" | "Taking care of people is at th… | "6/10/2024" | 8 | 39063746 | "Sedgwick" | "Sedgwick" | false | "[ 2 ]" | "[ "Bachelor's degree" ]" | 2 | "Bachelor's degree" | null | null | 1 | "Full-time (> 32 hours)" | 5 | null | false | null | 0 | "[None]" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231113 | "Data / Data Mining Analyst" | 23111310 | "Data Analyst" | 2311 | "Data Analysis and Mathematics" | 23111310 | "Data Analyst" | 231113 | "Data / Data Mining Analyst" | 2311 | "Data Analysis and Mathematics" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | null | null | 52 | "Finance and Insurance" | 524 | "Insurance Carriers and Related… | 5242 | "Agencies, Brokerages, and Othe… | 52429 | "Other Insurance Related Activi… | 524291 | "Claims Adjusting" |
| "1b5c3941e54a1889ef4f8ae55b401a… | "9/6/2024" | "2024-09-06 20:32:57.352 Z" | 1 | "6/2/2024" | "7/20/2024" | 48 | "[ "Job Board" ]" | "[ "disabledperson.com", "d… | "[ "https://www.disabledperso… | "[]" | null | "Sr. Lead Data Mgmt. Analyst - … | "About this role: Wells Fargo … | "6/12/2024" | 10 | 37615159 | "Wells Fargo" | "Wells Fargo" | false | "[ 99 ]" | "[ "No Education Listed" ]" | 99 | "No Education Listed" | null | null | 1 | "Full-time (> 32 hours)" | 3 | null | false | null | 0 | "[None]" | null | null | null | … | "Data Scientists" | 23 | "Information Technology and Com… | 231113 | "Data / Data Mining Analyst" | 23111310 | "Data Analyst" | 2311 | "Data Analysis and Mathematics" | 23111310 | "Data Analyst" | 231113 | "Data / Data Mining Analyst" | 2311 | "Data Analysis and Mathematics" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | "[ 6 ]" | "[ "Data Privacy/Protection" … | 52 | "Finance and Insurance" | 522 | "Credit Intermediation and Rela… | 5221 | "Depository Credit Intermediati… | 52211 | "Commercial Banking" | 522110 | "Commercial Banking" |
| "cb5ca25f02bdf25c13edfede793150… | "6/19/2024" | "2024-06-19 07:00:00.000 Z" | 0 | "6/2/2024" | "6/17/2024" | 15 | "[ "FreeJobBoard" ]" | "[ "craigslist.org" ]" | "[ "https://modesto.craigslis… | "[]" | null | "Comisiones de $1000 - $3000 po… | "Comisiones de $1000 - $3000 po… | "6/17/2024" | 15 | 0 | "Unclassified" | "LH/GM" | false | "[ 99 ]" | "[ "No Education Listed" ]" | 99 | "No Education Listed" | null | null | 3 | "Part-time / full-time" | null | null | false | 92500 | 0 | "[None]" | "year" | 150000 | 35000 | … | "Data Scientists" | 23 | "Information Technology and Com… | 231010 | "Business Intelligence Analyst" | 23101012 | "Oracle Consultant / Analyst" | 2310 | "Business Intelligence" | 23101012 | "Oracle Consultant / Analyst" | 231010 | "Business Intelligence Analyst" | 2310 | "Business Intelligence" | 23 | "Information Technology and Com… | "15-0000" | "Computer and Mathematical Occu… | "15-2000" | "Mathematical Science Occupatio… | "15-2050" | "Data Scientists" | "15-2051" | "Data Scientists" | null | null | 99 | "Unclassified Industry" | 999 | "Unclassified Industry" | 9999 | "Unclassified Industry" | 99999 | "Unclassified Industry" | 999999 | "Unclassified Industry" |
Jobs.shape  # (rows, columns) of the full dataset
(72498, 131)
# List every column name in the dataset, one per line.
columns = Jobs.columns
print(*columns, sep="\n")
ID LAST_UPDATED_DATE LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION SOURCE_TYPES SOURCES URL ACTIVE_URLS ACTIVE_SOURCES_INFO TITLE_RAW BODY MODELED_EXPIRED MODELED_DURATION COMPANY COMPANY_NAME COMPANY_RAW COMPANY_IS_STAFFING EDUCATION_LEVELS EDUCATION_LEVELS_NAME MIN_EDULEVELS MIN_EDULEVELS_NAME MAX_EDULEVELS MAX_EDULEVELS_NAME EMPLOYMENT_TYPE EMPLOYMENT_TYPE_NAME MIN_YEARS_EXPERIENCE MAX_YEARS_EXPERIENCE IS_INTERNSHIP SALARY REMOTE_TYPE REMOTE_TYPE_NAME ORIGINAL_PAY_PERIOD SALARY_TO SALARY_FROM LOCATION CITY CITY_NAME COUNTY COUNTY_NAME MSA MSA_NAME STATE STATE_NAME COUNTY_OUTGOING COUNTY_NAME_OUTGOING COUNTY_INCOMING COUNTY_NAME_INCOMING MSA_OUTGOING MSA_NAME_OUTGOING MSA_INCOMING MSA_NAME_INCOMING NAICS2 NAICS2_NAME NAICS3 NAICS3_NAME NAICS4 NAICS4_NAME NAICS5 NAICS5_NAME NAICS6 NAICS6_NAME TITLE TITLE_NAME TITLE_CLEAN SKILLS SKILLS_NAME SPECIALIZED_SKILLS SPECIALIZED_SKILLS_NAME CERTIFICATIONS CERTIFICATIONS_NAME COMMON_SKILLS COMMON_SKILLS_NAME SOFTWARE_SKILLS SOFTWARE_SKILLS_NAME ONET ONET_NAME ONET_2019 ONET_2019_NAME CIP6 CIP6_NAME CIP4 CIP4_NAME CIP2 CIP2_NAME SOC_2021_2 SOC_2021_2_NAME SOC_2021_3 SOC_2021_3_NAME SOC_2021_4 SOC_2021_4_NAME SOC_2021_5 SOC_2021_5_NAME LOT_CAREER_AREA LOT_CAREER_AREA_NAME LOT_OCCUPATION LOT_OCCUPATION_NAME LOT_SPECIALIZED_OCCUPATION LOT_SPECIALIZED_OCCUPATION_NAME LOT_OCCUPATION_GROUP LOT_OCCUPATION_GROUP_NAME LOT_V6_SPECIALIZED_OCCUPATION LOT_V6_SPECIALIZED_OCCUPATION_NAME LOT_V6_OCCUPATION LOT_V6_OCCUPATION_NAME LOT_V6_OCCUPATION_GROUP LOT_V6_OCCUPATION_GROUP_NAME LOT_V6_CAREER_AREA LOT_V6_CAREER_AREA_NAME SOC_2 SOC_2_NAME SOC_3 SOC_3_NAME SOC_4 SOC_4_NAME SOC_5 SOC_5_NAME LIGHTCAST_SECTORS LIGHTCAST_SECTORS_NAME NAICS_2022_2 NAICS_2022_2_NAME NAICS_2022_3 NAICS_2022_3_NAME NAICS_2022_4 NAICS_2022_4_NAME NAICS_2022_5 NAICS_2022_5_NAME NAICS_2022_6 NAICS_2022_6_NAME
Filter to postings where all three salary fields (SALARY, SALARY_FROM, SALARY_TO) are populated.
# Keep only postings where all three salary fields are populated
# (equivalent to dropping rows where any of them is null).
Jobs_w_salary_info = Jobs.filter(
    pl.col("SALARY").is_not_null()
    & pl.col("SALARY_TO").is_not_null()
    & pl.col("SALARY_FROM").is_not_null()
)

# Summary statistics for the three salary columns.
Jobs_w_salary_info.select(["SALARY_FROM", "SALARY_TO", "SALARY"]).describe()
| statistic | SALARY_FROM | SALARY_TO | SALARY |
|---|---|---|---|
| str | f64 | f64 | f64 |
| "count" | 30808.0 | 30808.0 | 30808.0 |
| "null_count" | 0.0 | 0.0 | 0.0 |
| "mean" | 96108.560277 | 139349.489938 | 117953.755031 |
| "std" | 38651.515935 | 57867.606046 | 45133.878359 |
| "min" | 10230.0 | 16640.0 | 15860.0 |
| "25%" | 66560.0 | 96471.0 | 84933.0 |
| "50%" | 90000.0 | 135000.0 | 116300.0 |
| "75%" | 119800.0 | 178200.0 | 145600.0 |
| "max" | 500000.0 | 500000.0 | 500000.0 |
Jobs_w_salary_info.shape  # Rows remaining after the salary filter
(30808, 131)
# Tally postings per employment-type label (returns a struct[2] column
# of {value, count} pairs).
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE_NAME").value_counts()
)
| EMPLOYMENT_TYPE_NAME |
|---|
| struct[2] |
| {"Part-time (≤ 32 hours)",1038} |
| {"Full-time (> 32 hours)",29151} |
| {"Part-time / full-time",619} |
# Same tally on the numeric employment-type code; counts match the
# named version above (1=full-time, 2=part-time, 3=part/full-time).
Jobs_w_salary_info.select(
    pl.col("EMPLOYMENT_TYPE").value_counts()
)
| EMPLOYMENT_TYPE |
|---|
| struct[2] |
| {2,1038} |
| {1,29151} |
| {3,619} |
# Salary summary statistics per employment type, highest average first.
salary = pl.col("SALARY")
Jobs_w_salary_info.group_by("EMPLOYMENT_TYPE_NAME").agg(
    pl.len().alias("Job_Count"),
    salary.mean().alias("Avg_Salary"),
    salary.median().alias("Median_Salary"),
    salary.min().alias("Min_Salary"),
    salary.max().alias("Max_Salary"),
    salary.std().alias("Salary_StdDev"),
).sort("Avg_Salary", descending=True)
| EMPLOYMENT_TYPE_NAME | Job_Count | Avg_Salary | Median_Salary | Min_Salary | Max_Salary | Salary_StdDev |
|---|---|---|---|---|---|---|
| str | u32 | f64 | f64 | i64 | i64 | f64 |
| "Full-time (> 32 hours)" | 29151 | 118897.558609 | 116500.0 | 20583 | 500000 | 44351.533443 |
| "Part-time / full-time" | 619 | 105621.242326 | 100000.0 | 20800 | 455375 | 52979.422642 |
| "Part-time (≤ 32 hours)" | 1038 | 98802.509634 | 86390.0 | 15860 | 310050 | 55382.720356 |
# Box plot of SALARY_FROM per employment type, overlaid with a
# colour-graded scatter of every individual observation.
df_plot = Jobs_w_salary_info.select(["EMPLOYMENT_TYPE_NAME", "SALARY_FROM"]).to_pandas()

plot_01 = go.Figure()

# One grey box per employment type, in order of first appearance.
for category in df_plot["EMPLOYMENT_TYPE_NAME"].unique():
    salaries = df_plot[df_plot["EMPLOYMENT_TYPE_NAME"] == category]["SALARY_FROM"]
    plot_01.add_trace(
        go.Box(
            y=salaries,
            name=category,
            boxpoints='outliers',
            marker_color='lightgray',
            line_color='black',
            showlegend=False
        )
    )

# Overlay every observation as a point, coloured by its salary value.
point_style = dict(
    color=df_plot["SALARY_FROM"],
    colorscale='Viridis',
    showscale=True,
    size=6,
    opacity=0.7,
    colorbar=dict(title="Starting Salary")
)
plot_01.add_trace(
    go.Scatter(
        x=df_plot["EMPLOYMENT_TYPE_NAME"],
        y=df_plot["SALARY_FROM"],
        mode='markers',
        marker=point_style,
        name='Salary Points',
        hoverinfo='x+y'
    )
)

plot_01.update_layout(
    title="Starting Salary Distribution by Employment Type",
    xaxis_title="Employment Type",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(tick0=0, dtick=50000, gridcolor="lightgray"),
    margin=dict(t=80, b=100, l=80, r=40),
    boxmode='group',
    boxgap=0.4,
    height=600,
    width=1000
)
plot_01.show()
This chart shows that employees in full-time roles generally earn more than those in part-time roles, which is intuitive. It is unclear what the 'Part-time / full-time' category represents; it may indicate postings open to either arrangement.
# Same box-plus-scatter layout as plot_01, but grouped by 2-digit NAICS
# industry instead of employment type.
df_plot = Jobs_w_salary_info.select(["NAICS2_NAME", "SALARY_FROM"]).to_pandas()
industries = df_plot["NAICS2_NAME"].unique()
plot_02 = go.Figure()
# One grey box per industry, in order of first appearance.
for industry in industries:
    plot_02.add_trace(
        go.Box(
            y=df_plot[df_plot["NAICS2_NAME"] == industry]["SALARY_FROM"],
            name=industry,
            boxpoints='outliers',
            marker_color='lightgray',
            line_color='black',
            showlegend=False
        )
    )
# Overlay the individual observations, coloured by salary value.
plot_02.add_trace(
    go.Scatter(
        x=df_plot["NAICS2_NAME"],
        y=df_plot["SALARY_FROM"],
        mode='markers',
        marker=dict(
            color=df_plot["SALARY_FROM"],
            colorscale='Turbo',
            showscale=True,
            size=6,
            opacity=0.7,
            colorbar=dict(title="Starting Salary")
        ),
        name='Salary Points',
        hoverinfo='x+y'
    )
)
# Wider canvas and bigger bottom margin to fit the long industry labels.
plot_02.update_layout(
    title="Starting Salary Distribution by Industry (NAICS2_NAME)",
    xaxis_title="Industry (NAICS2)",
    yaxis_title="Starting Salary",
    xaxis_tickangle=-45,
    template='plotly_white',
    yaxis=dict(
        tick0=0,
        dtick=50000,
        gridcolor="lightgray"
    ),
    margin=dict(
        t=80,
        b=120,
        l=80,
        r=40
    ),
    boxmode='group',
    boxgap=0.4,
    height=700,
    width=1200
)
plot_02.show()
The plot shows the starting (SALARY_FROM) salary for the industries in the NAICS2_NAME column. Some very high starting salaries appear in the "Administrative and Support and Waste Management and Remediation Services" category, and also in "Health Care and Social Assistance."
# Parse POSTED ("m/d/Y" strings) into a real Date column; strict=False
# turns unparseable values into nulls instead of raising.
Jobs_with_dates = Jobs.with_columns(
    pl.col("POSTED")
    .cast(pl.Utf8)
    .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
    .alias("POSTED_DATE")
)

# Count postings per calendar day, in chronological order.
daily_postings = (
    Jobs_with_dates
    .group_by("POSTED_DATE")
    .agg(pl.len().alias("Post_Count"))
    .sort("POSTED_DATE")
)

df_line = daily_postings.to_pandas()
# Centered 7-day moving average to smooth out the weekly posting cycle.
df_line["Smoothed"] = df_line["Post_Count"].rolling(7, center=True).mean()
# Time-series of daily posting counts: raw line plus 7-day smoothed line.
plot_03 = go.Figure()
# Raw daily line (gray)
plot_03.add_trace(go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Post_Count"],
    mode="lines",
    name="Raw Daily Count",
    line=dict(color="lightgray", width=2)
))
# Smoothed line (red) — centered 7-day rolling mean computed above
plot_03.add_trace(go.Scatter(
    x=df_line["POSTED_DATE"],
    y=df_line["Smoothed"],
    mode="lines",
    name="7-Day Smoothed",
    line=dict(color="red", width=3)
))
# Layout and formatting
plot_03.update_layout(
    title="Job Postings Over Time (Raw + 7-Day Smoothed)",
    xaxis_title="Date",
    yaxis_title="Number of Postings",
    template="plotly_white",
    height=500,
    width=1000,
    xaxis_tickangle=-45,
    margin=dict(t=60, b=100, l=60, r=40),
    legend=dict(x=0.01, y=0.99, bordercolor="gray", borderwidth=1)
)
plot_03.show()
# Date range covered by the postings (earliest and latest POSTED_DATE).
Jobs_with_dates.select([
    pl.col("POSTED_DATE").min().alias("Earliest"),
    pl.col("POSTED_DATE").max().alias("Latest")
])
| Earliest | Latest |
|---|---|
| date | date |
| 2024-05-01 | 2024-09-30 |
This shows the number of jobs posted per day in the dataset, indicating a general upward trend. While the busiest day of the week varies slightly from week to week, the lowest posting counts consistently fall on Sundays.
# Ten most common job titles, by number of postings.
top_titles = (
    Jobs.group_by("TITLE_NAME")
    .agg(pl.len().alias("Count"))
    .sort("Count", descending=True)
    .head(10)
)
df_top_titles = top_titles.to_pandas()

# Bar chart, one colour per title (legend hidden — the x-axis already
# labels each bar).
plot_04 = px.bar(
    df_top_titles,
    x="TITLE_NAME",
    y="Count",
    color="TITLE_NAME",
    labels={"TITLE_NAME": "Job Title", "Count": "Number of Postings"},
    title="Top 10 Job Titles by Number of Postings",
    template="plotly_white"
)
plot_04.update_layout(
    xaxis_tickangle=-45,
    height=500,
    width=1000,
    showlegend=False,
    margin=dict(t=60, b=100, l=60, r=40)
)
plot_04.show()
This bar chart is easy to read and shows that "Data Analyst" is the most common job title in the dataset, by a wide margin.
Jobs.select("REMOTE_TYPE_NAME").unique()  # Distinct remote-type labels, incl. null and the "[None]" placeholder
| REMOTE_TYPE_NAME |
|---|
| str |
| "Hybrid Remote" |
| "Not Remote" |
| "Remote" |
| null |
| "[None]" |
# Collapse both "missing" encodings of remote status — true nulls and the
# literal "[None]" placeholder string — into a single "Unknown" label.
remote_raw = pl.col("REMOTE_TYPE_NAME")
is_missing = remote_raw.is_null() | remote_raw.cast(pl.Utf8).str.strip_chars("[]").eq("None")
Jobs_cleaned_remote = Jobs.with_columns(
    pl.when(is_missing)
    .then(pl.lit("Unknown"))
    .otherwise(remote_raw)
    .alias("REMOTE_TYPE_NAME")
)
Jobs_cleaned_remote.select("REMOTE_TYPE_NAME").unique()  # Verify: null and "[None]" are now merged into "Unknown"
| REMOTE_TYPE_NAME |
|---|
| str |
| "Hybrid Remote" |
| "Remote" |
| "Not Remote" |
| "Unknown" |
# Posting counts for each (cleaned) remote-work category.
remote_counts = (
    Jobs_cleaned_remote
    .group_by("REMOTE_TYPE_NAME")
    .agg(pl.len().alias("Count"))
    .sort("Count", descending=True)
)
df_remote = remote_counts.to_pandas()

# Pie chart of the remote-type mix, labels drawn inside the slices.
plot_05 = px.pie(
    df_remote,
    values="Count",
    names="REMOTE_TYPE_NAME",
    title="Distribution of Remote Work Types",
    color_discrete_sequence=px.colors.qualitative.Set3
)
plot_05.update_traces(textinfo='percent+label', textposition='inside')
plot_05.update_layout(template="plotly_white", margin=dict(t=60, b=60, l=60, r=60))
plot_05.show()
Much of the remote-work information is missing: the placeholder string "[None]" is the largest category. For the pie chart, that placeholder (together with the few true null values) is relabeled as "Unknown."
Jobs.select("SKILLS_NAME").unique().shape  # Number of distinct skill-list strings (not individual skills)
(44173, 1)
Jobs.select("SKILLS_NAME").unique().head(20)  # Sample values: each row is a JSON-array string of skill names
| SKILLS_NAME |
|---|
| str |
| "[ "Management", "Coaching"… |
| "[ "Systems Development Life … |
| "[ "Business Objectives", "… |
| "[ "Presentations", "Office… |
| "[ "Ansys Simulation Software… |
| … |
| "[ "Management", "Microsoft… |
| "[ "Enterprise Resource Plann… |
| "[ "Workflow Management", "… |
| "[ "Bed Management", "Healt… |
| "[ "Mathematics", "Collecti… |
# Explode the skills column so each (posting, skill) pair becomes its own
# row.  SKILLS_NAME is stored as a JSON-array STRING (e.g.
# '["Management", "Coaching"]'), so it must be parsed with json_decode —
# a plain cast(pl.List(pl.Utf8)) on a Utf8 column does not parse the text.
# NOTE(review): assumes the stored strings are valid JSON arrays — confirm
# against the raw CSV; json_decode will raise on malformed entries.
exploded = Jobs.with_columns(
    pl.col("SKILLS_NAME").str.json_decode(pl.List(pl.Utf8))
).explode("SKILLS_NAME")

# Total skill mentions per 6-digit NAICS industry, top 15 only.
skills_per_industry = (
    exploded
    .group_by("NAICS_2022_6_NAME")
    .agg(pl.len().alias("Skill_Count"))
    .sort("Skill_Count", descending=True)
    .limit(15)
)
df_industry_skills = skills_per_industry.to_pandas()
plot_06 = px.bar(
df_industry_skills,
x="NAICS_2022_6_NAME",
y="Skill_Count",
title="Top 15 Industries by Skill Mentions",
labels={
"NAICS_2022_6_NAME": "Industry",
"Skill_Count": "Total Skill Mentions"
},
template="plotly_white",
color="NAICS_2022_6_NAME",
color_discrete_sequence=px.colors.qualitative.Pastel
)
plot_06.update_layout(
xaxis_tickangle=-45,
height=700,
width=1200,
margin=dict(t=60, b=140, l=60, r=40),
showlegend=False
)
plot_06.show()
There are 44,173 distinct skill-list strings in the SKILLS_NAME column (each row holds a list of skills, not a single skill). To keep the plot readable, it shows the top 15 industries by total count of individual skill mentions.
Jobs.select("ONET_NAME").unique()  # Only two values in the full data: null and "Business Intelligence Analysts"
| ONET_NAME |
|---|
| str |
| null |
| "Business Intelligence Analysts" |
Jobs_w_salary_info.select("ONET_NAME").unique()  # After the salary filter only the non-null category remains
| ONET_NAME |
|---|
| str |
| "Business Intelligence Analysts" |
# Median advertised salary for the single ONET occupation in the data.
Jobs_w_salary_info.filter(
    pl.col("ONET_NAME") == "Business Intelligence Analysts"
).select(
    pl.median("SALARY").alias("Median_Salary")
)
| Median_Salary |
|---|
| f64 |
| 116300.0 |
# Single-bar chart of the median BI-analyst salary.  Compute the median
# once into a scalar instead of inlining the query in the px.bar call.
bi_median = (
    Jobs_w_salary_info
    .filter(pl.col("ONET_NAME") == "Business Intelligence Analysts")
    .select(pl.median("SALARY"))
    .item()
)
plot_07 = px.bar(
    x=["Business Intelligence Analysts"],
    y=[bi_median],
    labels={"x": "[ONET] Occupation", "y": "Median Salary"},
    title="Median Salary: Business Intelligence Analysts",
    template="plotly_white"
)
plot_07.update_traces(marker_color='teal')
plot_07.show()
The only ONET_NAME category, after filtering for salary information, is "Business Intelligence Analysts." Without the salary filter, the column contains only "Business Intelligence Analysts" and null.
Jobs.select("SOC_2021_2_NAME").unique()  # Two values: null and "Computer and Mathematical Occupations"
| SOC_2021_2_NAME |
|---|
| str |
| null |
| "Computer and Mathematical Occu… |
# Posting counts per 2-digit SOC name (one non-null category dominates).
Jobs.group_by("SOC_2021_2_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
| SOC_2021_2_NAME | Post_Count |
|---|---|
| str | u32 |
| "Computer and Mathematical Occu… | 72454 |
| null | 44 |
Jobs.select("SOC_2021_3_NAME").unique()  # Two values: null and "Mathematical Science Occupations"
| SOC_2021_3_NAME |
|---|
| str |
| "Mathematical Science Occupatio… |
| null |
# Posting counts per 3-digit SOC name — same 72454/44 split as 2-digit.
Jobs.group_by("SOC_2021_3_NAME").agg(
    pl.len().alias("Post_Count")
).sort("Post_Count", descending=True)
| SOC_2021_3_NAME | Post_Count |
|---|---|
| str | u32 |
| "Mathematical Science Occupatio… | 72454 |
| null | 44 |
# Count postings for every (2-digit SOC, 3-digit SOC) pair, dropping
# rows where either level is null, to feed the Sankey diagram.
transitions = (
    Jobs.group_by(["SOC_2021_2_NAME", "SOC_2021_3_NAME"])
    .agg(pl.len().alias("Count"))
    .filter(
        pl.col("SOC_2021_2_NAME").is_not_null()
        & pl.col("SOC_2021_3_NAME").is_not_null()
    )
)
df_transitions = transitions.to_pandas()

# Deduplicate node labels across both columns, preserving first-seen
# order, and assign each label a Sankey node index.
ordered_labels = dict.fromkeys(
    df_transitions["SOC_2021_2_NAME"].tolist()
    + df_transitions["SOC_2021_3_NAME"].tolist()
)
label_to_index = {label: idx for idx, label in enumerate(ordered_labels)}
df_transitions["source"] = df_transitions["SOC_2021_2_NAME"].map(label_to_index)
df_transitions["target"] = df_transitions["SOC_2021_3_NAME"].map(label_to_index)
# Sankey diagram of 2-digit -> 3-digit SOC flows, using the node indices
# built in label_to_index / df_transitions above.
plot_08 = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=list(label_to_index.keys()),
        color="lightblue"
    ),
    link=dict(
        source=df_transitions["source"],
        target=df_transitions["target"],
        value=df_transitions["Count"]
    )
)])
plot_08.update_layout(
    title_text="SOC Transitions: 2-Digit → 3-Digit Level",
    font_size=12,
    height=700,
    width=1000
)
plot_08.show()
As with the ONET column, both SOC_2021_2_NAME and SOC_2021_3_NAME each contain a single non-null category covering essentially the entire dataset, so a Sankey diagram of this kind is largely uninformative here.